Sumit (8977542)

Paras Rupani (8961758)

Asif Afzal (8764552)

Importing Modules¶

In [1]:
import numpy as np 
import pandas as pd 
import plotly
plotly.offline.init_notebook_mode()
In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
In [3]:
data = pd.read_csv(r'./laptop.csv')
data.sample(3)
Out[3]:
Unnamed: 0 Company TypeName Inches ScreenResolution Cpu Ram Memory Gpu OpSys Weight Price
680 680 MSI Gaming 15.6 Full HD 1920x1080 Intel Core i5 7300HQ 2.5GHz 8GB 256GB SSD Nvidia GeForce GTX 1050 Windows 10 2.2kg 53168.1120
783 783 Lenovo Notebook 15.6 1366x768 Intel Celeron Quad Core N3710 1.6GHz 8GB 1TB HDD Intel HD Graphics 405 No OS 2.2kg 17529.1200
715 715 Lenovo Notebook 14.0 IPS Panel Full HD 1920x1080 Intel Core i5 7200U 2.5GHz 8GB 256GB SSD Nvidia GeForce 920MX Windows 10 1.87kg 45664.6896
In [4]:
data.columns
Out[4]:
Index(['Unnamed: 0', 'Company', 'TypeName', 'Inches', 'ScreenResolution',
       'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight', 'Price'],
      dtype='object')

Data Analysis¶

Which brand is the most frequent in the dataframe?¶

In [5]:
import plotly.express as px

# Assuming data is your DataFrame containing the data
company_counts = data['Company'].value_counts().reset_index()
company_counts.columns = ['Company', 'Count']

fig = px.bar(company_counts, x='Company', y='Count',
             labels={'Company': 'Company', 'Count': 'Number of Laptops'},
             color='Company',
             title='Number of laptops by brands')
fig.update_layout(xaxis={'tickangle': 80})
fig.show()

Most of laptops are from Dell, Lenovo and HP.

What type of laptop is the most frequent?¶

In [6]:
import plotly.express as px

# Assuming data is your DataFrame containing the data
type_counts = data['TypeName'].value_counts().reset_index()
type_counts.columns = ['TypeName', 'Count']

fig = px.pie(type_counts, values='Count', names='TypeName', title='Laptop types')
fig.show()
In [7]:
print('Most laptops are notebooks, which make {:.2f}% of the total laptops'.format(len(data[data['TypeName']=='Notebook'])*100/len(data)))
Most laptops are notebooks, which make 55.79% of the total laptops

Which size is the most popular?¶

In [8]:
import plotly.express as px

# Assuming data is your DataFrame containing the data
inch_counts = data['Inches'].value_counts().reset_index()
inch_counts.columns = ['Inches', 'Count']

# Plot the screen size distribution
fig = px.bar(inch_counts, x='Inches', y='Count', 
             labels={'Inches': 'Screen Size (inches)', 'Count': 'Number of Laptops'},
             title='Laptop Screen Size Distribution')

# Set a narrower range for the x-axis
fig.update_layout(xaxis=dict(title='Screen Size (inches)', tickangle=45, range=[min(inch_counts['Inches']) - 0.5, max(inch_counts['Inches']) + 0.5]),
                  yaxis=dict(title='Number of Laptops'))

fig.show()
In [9]:
print('Most laptops have 15.6 inches, which make {:.2f}% of the total laptops'.format(len(data[data['Inches']==15.6])*100/len(data)))
Most laptops have 15.6 inches, which make 51.04% of the total laptops

How is weight distributed among the laptops?¶

In [10]:
import plotly.express as px

def num_plot(data, column, title, unit):
    # Sort the DataFrame by the specified column
    sorted_data = data.sort_values(by=column)
    
    # Plot the histogram
    fig = px.histogram(sorted_data, x=column, 
                       labels={column: f'{column} ({unit})', 'count': 'Number of Laptops'},
                       title=title)
    fig.update_layout(xaxis={'title': f'{column} ({unit})'},
                      yaxis={'title': 'Number of Laptops'})
    fig.show()

# Assuming your DataFrame containing the data is named 'data'
num_plot(data, 'Weight', 'Weight Distribution', 'kg')
In [11]:
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

def num_plot(data, column, title, unit):
    # Plot the histogram using Seaborn's countplot
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.countplot(x=column, data=data, ax=ax)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    ax.set_xlabel(f'{title} ({unit})')
    plt.title(title)
    plt.show()

# Assuming your DataFrame containing the data is named 'data'
num_plot(data, 'Memory', 'First Hard Drive Capacity Distribution', 'GB')
No description has been provided for this image

What is the average price of laptops by company?¶

In [12]:
import plotly.express as px

# Assuming data is your DataFrame containing the data
company_list = data['Company'].unique()

filtered_data = data[data['Company'].isin(company_list)]
average_price_by_company = filtered_data.groupby('Company')['Price'].mean().reset_index()

fig = px.bar(average_price_by_company, x='Company', y='Price', 
             title='Average price of laptops by company',
             labels={'Company': 'Company', 'Price': 'Average Price'},
             color='Company',
             color_discrete_sequence=px.colors.qualitative.Pastel,
             text='Price')
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(xaxis={'tickangle': 80})
fig.show()

Data preprocessing¶

1. Removing column 'Unnamed: 0¶

In [13]:
data.drop(columns='Unnamed: 0', inplace=True)
data.sample(3)
Out[13]:
Company TypeName Inches ScreenResolution Cpu Ram Memory Gpu OpSys Weight Price
771 Samsung 2 in 1 Convertible 15.0 Full HD / Touchscreen 1920x1080 Intel Core i7 7500U 2.7GHz 16GB 256GB SSD AMD Radeon 540 Windows 10 1.71kg 95850.72
407 Dell Ultrabook 14.0 Full HD 1920x1080 Intel Core i7 7600U 2.8GHz 8GB 512GB SSD Intel HD Graphics Windows 10 1.36kg 89510.40
564 Asus Notebook 15.6 Full HD 1920x1080 Intel Core i5 7300HQ 2.5GHz 8GB 128GB SSD + 1TB HDD Nvidia GeForce GTX 1050 Windows 10 2.5kg 44701.92

Checking for 'null values' & 'duplicate rows' in dataset¶

In [14]:
def count_percent(data):
    data_columns = pd.DataFrame({'Count Missing': data.isnull().sum(),
                                'Percent Missing': data.isnull().sum()*100/data.shape[0],
                                'Count Duplicate Rows': data.duplicated().sum(),
                                'Percent Duplicate Rows': data.duplicated().sum()*100/data.shape[0]
                                })
    return data_columns
count_percent(data) 
Out[14]:
Count Missing Percent Missing Count Duplicate Rows Percent Duplicate Rows
Company 0 0.0 29 2.225633
TypeName 0 0.0 29 2.225633
Inches 0 0.0 29 2.225633
ScreenResolution 0 0.0 29 2.225633
Cpu 0 0.0 29 2.225633
Ram 0 0.0 29 2.225633
Memory 0 0.0 29 2.225633
Gpu 0 0.0 29 2.225633
OpSys 0 0.0 29 2.225633
Weight 0 0.0 29 2.225633
Price 0 0.0 29 2.225633

There are 29 duplicate rows.

In [15]:
count_non_duplicate_rows = data.duplicated(keep=False).sum() - data.duplicated().sum()
print("Out of {0} rows, {1} rows are original and {2} are duplicate rows.".format(data.duplicated(keep=False).sum(),count_non_duplicate_rows,data.duplicated().sum()))
Out of 43 rows, 14 rows are original and 29 are duplicate rows.

Dropping duplicate rows¶

In [16]:
data.drop_duplicates(inplace=True)
print("Duplicate Rows: ",data.duplicated().sum())
print(data.shape)
count_percent(data)
Duplicate Rows:  0
(1274, 11)
Out[16]:
Count Missing Percent Missing Count Duplicate Rows Percent Duplicate Rows
Company 0 0.0 0 0.0
TypeName 0 0.0 0 0.0
Inches 0 0.0 0 0.0
ScreenResolution 0 0.0 0 0.0
Cpu 0 0.0 0 0.0
Ram 0 0.0 0 0.0
Memory 0 0.0 0 0.0
Gpu 0 0.0 0 0.0
OpSys 0 0.0 0 0.0
Weight 0 0.0 0 0.0
Price 0 0.0 0 0.0
  • 'Unnamed: 0' column is removed.

  • We found 29 duplicated and rows and removed it.

  • Now there are no more 'duplicate rows' and 'null values' in the dataset.

using wishker approach to decide Budget , mid-range , expensive Laptop prices¶

In [17]:
min_price = data['Price'].min()
max_price = data['Price'].max()
median_price = data['Price'].median()

print("Minimum Price:", min_price)
print("Maximum Price:", max_price)
print("Median Price:", median_price)
Minimum Price: 9270.72
Maximum Price: 324954.72
Median Price: 52693.92

Selecting Budget Laptops, Mid-range, expensive Price Range¶

In [18]:
labels = ['budget', 'Mid-range', 'expensive']

# Split the 'Price' column into categories using quartiles and assign labels
data['Price_Category'] = pd.qcut(data['Price'], q=[0, 0.33, 0.66, 1], labels=labels)

# Display the first few rows of the DataFrame with the new 'Price_Category' column
print(data[['Price', 'Price_Category']].head())
         Price Price_Category
0   71378.6832      expensive
1   47895.5232      Mid-range
2   30636.0000         budget
3  135195.3360      expensive
4   96095.8080      expensive

Converting String Value Present in Ram, Memory, Weight into Numerical value¶

In [19]:
data['Ram'] = data['Ram'].str.extract('(\d+)').astype(float)
data['Memory'] = data['Memory'].str.extract('(\d+)').astype(float)
data['Weight'] = data['Weight'].str.extract('(\d+.\d+)').astype(float)
In [20]:
data.head()
Out[20]:
Company TypeName Inches ScreenResolution Cpu Ram Memory Gpu OpSys Weight Price Price_Category
0 Apple Ultrabook 13.3 IPS Panel Retina Display 2560x1600 Intel Core i5 2.3GHz 8.0 128.0 Intel Iris Plus Graphics 640 macOS 1.37 71378.6832 expensive
1 Apple Ultrabook 13.3 1440x900 Intel Core i5 1.8GHz 8.0 128.0 Intel HD Graphics 6000 macOS 1.34 47895.5232 Mid-range
2 HP Notebook 15.6 Full HD 1920x1080 Intel Core i5 7200U 2.5GHz 8.0 256.0 Intel HD Graphics 620 No OS 1.86 30636.0000 budget
3 Apple Ultrabook 15.4 IPS Panel Retina Display 2880x1800 Intel Core i7 2.7GHz 16.0 512.0 AMD Radeon Pro 455 macOS 1.83 135195.3360 expensive
4 Apple Ultrabook 13.3 IPS Panel Retina Display 2560x1600 Intel Core i5 3.1GHz 8.0 256.0 Intel Iris Plus Graphics 650 macOS 1.37 96095.8080 expensive

Checking null value in weight¶

In [21]:
null_weight = data['Weight'].isnull().sum()

# Display the number of null values
print("Number of null values in the 'Weight' column:", null_weight)
Number of null values in the 'Weight' column: 50

replacing null value in weight with mean¶

In [22]:
mean_weight = data['Weight'].mean()

data['Weight'].fillna(mean_weight, inplace=True)

printing unique value in TypeName¶

In [23]:
unique_type_names = data['TypeName'].unique()

# Display the unique values
print("Unique values in the 'TypeName' column:")
for value in unique_type_names:
    print(value)
Unique values in the 'TypeName' column:
Ultrabook
Notebook
Netbook
Gaming
2 in 1 Convertible
Workstation

categorical encoding¶

In [24]:
type_name_mapping = {
    'Ultrabook': 0,
    'Notebook': 1,
    'Netbook': 2,
    'Gaming': 3,
    '2 in 1 Convertible': 4,
    'Workstation': 5
}

data['TypeName'] = data['TypeName'].map(type_name_mapping)

Checking Dataset Before deciding Features and Labels¶

In [25]:
data.head()
Out[25]:
Company TypeName Inches ScreenResolution Cpu Ram Memory Gpu OpSys Weight Price Price_Category
0 Apple 0 13.3 IPS Panel Retina Display 2560x1600 Intel Core i5 2.3GHz 8.0 128.0 Intel Iris Plus Graphics 640 macOS 1.37 71378.6832 expensive
1 Apple 0 13.3 1440x900 Intel Core i5 1.8GHz 8.0 128.0 Intel HD Graphics 6000 macOS 1.34 47895.5232 Mid-range
2 HP 1 15.6 Full HD 1920x1080 Intel Core i5 7200U 2.5GHz 8.0 256.0 Intel HD Graphics 620 No OS 1.86 30636.0000 budget
3 Apple 0 15.4 IPS Panel Retina Display 2880x1800 Intel Core i7 2.7GHz 16.0 512.0 AMD Radeon Pro 455 macOS 1.83 135195.3360 expensive
4 Apple 0 13.3 IPS Panel Retina Display 2560x1600 Intel Core i5 3.1GHz 8.0 256.0 Intel Iris Plus Graphics 650 macOS 1.37 96095.8080 expensive

Correlation Matrix¶

  • The correlation matrix can give some useful informations about the linear correlation between the numerical features
In [26]:
import matplotlib.pyplot as plt
import seaborn as sns

# Select the columns for correlation calculation
selected_columns = ['Ram', 'Memory', 'Inches', 'Weight', 'TypeName']
selected_data = data[selected_columns]

plt.figure(figsize=(6, 5))
sns.heatmap(selected_data.corr(), cmap='RdBu', annot=True, vmin=-1, vmax=1)
plt.title('Correlation Matrix')
plt.show()
No description has been provided for this image
  • RAM has a high positive correlation with price (+0.75): more expensive laptops tends to have a higher pric

  • Inches and Weight have a high positive correlation (+0.82) since laptops with bigger screens tend to be heavier.

Define features and target¶

In [27]:
X = data[['Ram', 'Memory', 'Inches', 'Weight', 'TypeName']]
y = data['Company']

Split the dataset into 80% training and 20% testing¶

In [28]:
from sklearn.model_selection import train_test_split

# Split the dataset into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the training and testing sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)
Training set shape: (1019, 5) (1019,)
Testing set shape: (255, 5) (255,)

Model-1 (RandomForestClassifier)¶

In [29]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Print the training accuracy
training_accuracy = rf_classifier.score(X_train, y_train)
print("Training accuracy:", training_accuracy)
Training accuracy: 0.9018645731108931

Predictions of Company¶

In [30]:
# Make predictions on the test data
y_pred = rf_classifier.predict(X_test)

# Print the first few predictions
print("Predictions:", y_pred[:5])

# Print the actual values
print("Actual values:", y_test.values[:5])
Predictions: ['HP' 'Lenovo' 'Lenovo' 'Dell' 'HP']
Actual values: ['HP' 'Dell' 'Xiaomi' 'Dell' 'Dell']
In [31]:
# Print the feature names
print("Feature names:", X.columns.tolist())

# Print the unique values of the target variable
print("Unique company labels:", y.unique())
Feature names: ['Ram', 'Memory', 'Inches', 'Weight', 'TypeName']
Unique company labels: ['Apple' 'HP' 'Acer' 'Asus' 'Dell' 'Lenovo' 'Chuwi' 'MSI' 'Microsoft'
 'Toshiba' 'Huawei' 'Xiaomi' 'Vero' 'Razer' 'Mediacom' 'Samsung' 'Google'
 'Fujitsu' 'LG']

Providing inputs for predictions¶

In [32]:
# Define the specifications for the laptop you want to predict
spec = [16, 512, 15.6, 3, 1]

spec = [spec]
company_pred = rf_classifier.predict(spec)
print("Predicted Company:", company_pred[0])
Predicted Company: Lenovo

Model-2 (LinearRegression)¶

Define features and target variable¶

In [33]:
X = data[['Ram', 'Memory', 'Inches', 'Weight', 'TypeName']]
y = data['Price']
In [34]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
import pickle

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

# Evaluate models using cross-validation
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
    print(f"{name}: MAE = {-scores.mean()} (±{scores.std()})")

# Train the best model on the entire training set
best_model = RandomForestRegressor(random_state=42)
best_model.fit(X_train, y_train)

# Save the best model to a file
with open('best_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)
Linear Regression: MAE = 17148.89697478473 (±777.1787229299255)
Decision Tree: MAE = 14358.04686958316 (±1395.4092472105663)
Random Forest: MAE = 12290.942519498396 (±933.7219958842466)
Gradient Boosting: MAE = 12941.940647067655 (±445.63436217606977)

Predictions vs actual values vs company names¶

In [35]:
# Make predictions on the test data
y_pred = best_model.predict(X_test)

# Select the first few predictions, actual values, and company names from the original dataset 'data'
predictions_df = pd.DataFrame({
    'Actual Price': y_test.values[:5],
    'Predicted Price': y_pred[:5],
    'Company': data.loc[X_test.index[:5], 'Company']  # Assuming 'Company' is the column name for company names in 'data'
})

# Print the DataFrame
print(predictions_df)
     Actual Price  Predicted Price Company
309    24988.3200     20353.739791      HP
622    41345.2800     24528.623802    Dell
184    63882.7200     63483.290052  Xiaomi
705    42357.6000     43016.424577    Dell
522    68184.0144     59898.614709    Dell

Random 5 Prediction for Prices¶

In [36]:
# Make predictions on the test data
y_pred = rf_classifier.predict(X_test)

# Print the first few predictions
print("Predictions:", y_pred[:5])

# Print the actual values
print("Actual values:", y_test.values[:5])
Predictions: ['HP' 'Lenovo' 'Lenovo' 'Dell' 'HP']
Actual values: [24988.32   41345.28   63882.72   42357.6    68184.0144]

selecting the specifications for the laptop you want to predict¶

In [37]:
# Define the specifications for the laptop you want to predict
spec = [[8, 128, 13.3, 1.37, 0]]

# Make predictions using the trained model
price_pred = best_model.predict(spec)

# Print the predicted price
print("Predicted Price:", price_pred[0])
Predicted Price: 66922.73427599993

Define the specifications for the laptop you want to predict based on budget, Mid-range, expensive¶

In [38]:
# Define the specifications for the laptop you want to predict
spec = [[16, 256, 13.3, 1.37, 0]]


price_pred = best_model.predict(spec)
print("Predicted Price:", price_pred[0])


if price_pred[0] <= data['Price'].quantile(0.33):
    category = 'budget'
elif price_pred[0] >= data['Price'].quantile(0.66):
    category = 'expensive'
else:
    category = 'Mid-range'

# Print the price category
print("Price Category:", category)
Predicted Price: 96809.43499199998
Price Category: expensive

Predictions with User Choice¶

In [39]:
import pandas as pd

# Define the feature names
feature_names = ['Ram', 'Memory', 'Inches', 'Weight', 'TypeName']

# Initialize an empty dictionary to store user-provided values
spec = {}

# Ask the user to provide values for each feature
for feature in feature_names:
    value = input("Enter value for {}: ".format(feature))
    spec[feature] = value

# Convert the user-provided values to the desired format
ram = int(spec['Ram'])
memory = int(spec['Memory'])
inches = float(spec['Inches'])
weight = float(spec['Weight'])
type_name = spec['TypeName']

# Construct the spec list using the user-provided values
spec = [[ram, memory, inches, weight, type_name]]

# Make predictions using the trained model
price_pred = best_model.predict(spec)

# Print the predicted price
print("Predicted Price:", price_pred[0])

# Determine the price category
if price_pred[0] <= data['Price'].quantile(0.33):
    category = 'budget'
elif price_pred[0] >= data['Price'].quantile(0.66):
    category = 'expensive'
else:
    category = 'Mid-range'

# Print the price category
print("Price Category:", category)
Predicted Price: 47118.88195200001
Price Category: Mid-range